import numpy as np
import pandas as pd
# Visualisation libraries
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze the Heart Disease Dataset from the UCI Machine Learning Repository.

Picture Source: harvard.edu
The objective of this exercise is to develop a model that predicts whether heart disease is present or absent based on the remaining features.
# Read the space-delimited numeric table and attach the attribute names.
Raw_Values = np.genfromtxt('heart-disease/heart.dat', delimiter=' ')
Attributes = [
    'Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure', 'Serum Cholestoral',
    'Fasting Blood Sugar', 'Resting Electrocardiographic Results', 'Maximum Heart Rate Achieved',
    'Exercise Induced Angina', 'Oldpeak', 'Slope',
    'Number of Major Vessels', 'Thal', 'Heart Disease',
]
Data = pd.DataFrame(data=Raw_Values, columns=Attributes)
del Raw_Values

# These attributes hold category codes: truncate to integers, then store the
# codes as strings so they behave as labels rather than quantities.
Code_Columns = [
    'Sex', 'Chest Pain Type', 'Fasting Blood Sugar', 'Resting Electrocardiographic Results',
    'Exercise Induced Angina', 'Slope', 'Number of Major Vessels', 'Thal',
]
Data[Code_Columns] = Data[Code_Columns].astype(int).astype(str)
del Code_Columns

# Target column: shift the raw code down by one, then swap 0/1 for the
# corresponding entry of Labels.
Target = 'Heart Disease'
Labels = ['Absent', 'Present']
Zero_Based = (Data[Target] - 1).astype(int)
Data[Target] = Zero_Based.replace(dict(enumerate(Labels)))
del Zero_Based

# Age is kept numeric but as whole years.
Data['Age'] = Data['Age'].astype(int)
def Data_info(Inp, Only_NaN = False):
    """Summarise dtype and missing-value statistics for each column of *Inp*.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to describe.
    Only_NaN : bool, default False
        When true, keep only the rows (features) that have at least one NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name (index named ``'Features'``) with columns:
        ``'Data Type'`` (dtype as a string), ``'Number of NaN Values'``,
        ``'Size'`` (row count of *Inp*) and ``'Percentage'`` (100 minus the
        NaN share in percent, the share being rounded to two decimals).
    """
    row_count = Inp.shape[0]

    # Dtype table, ordered by dtype before it is joined with the NaN counts.
    dtype_table = Inp.dtypes.to_frame(name='Data Type')
    dtype_table = dtype_table.sort_values(by=['Data Type'])
    nan_table = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    Out = dtype_table.join(nan_table, how='outer')

    Out['Size'] = row_count
    nan_share = 100 * (Out['Number of NaN Values'] / row_count)
    Out['Percentage'] = 100 - np.round(nan_share, 2)
    Out.index.name = 'Features'
    # Store dtypes as plain strings so they can be used as labels downstream.
    Out['Data Type'] = Out['Data Type'].astype(str)

    if not Only_NaN:
        return Out
    return Out.loc[Out['Number of NaN Values'] > 0]
# Maps: for each coded column, the human-readable label of every code.
# Columns without an entry here keep their string codes unchanged.
Maps = {'Sex': {'0':'Female', '1':'Male'},
        'Chest Pain Type': {'1':'Typical Angina', '2':'Atypical Angina', '3': 'Non-Anginal Pain', '4':'Asymptomatic'},
        'Fasting Blood Sugar': {'0': 'False', '1': 'True'}, 'Exercise Induced Angina': {'0': 'No', '1': 'Yes'},
        'Slope': {'1': 'Upsloping', '2': 'Flat', '3': 'Downsloping'},
        'Thal': {'3': 'Normal', '6': 'Fixed Defect','7': 'Reversable Defect'}}
# Replace the string codes by their labels, column by column.
for column, code_labels in Maps.items():
    Data[column] = Data[column].replace(code_labels)
del column, code_labels
display(Data)
# One-row summary of the dataset dimensions, shown without the index column.
summary_style = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                              'Number of Attributes': [Data.shape[1]]}).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') is its replacement. Fall back to the old method
# only when running on a pandas version that predates Styler.hide.
if hasattr(summary_style, 'hide'):
    display(summary_style.hide(axis='index'))
else:
    display(summary_style.hide_index())
del summary_style
| | Age | Sex | Chest Pain Type | Resting Blood Pressure | Serum Cholestoral | Fasting Blood Sugar | Resting Electrocardiographic Results | Maximum Heart Rate Achieved | Exercise Induced Angina | Oldpeak | Slope | Number of Major Vessels | Thal | Heart Disease |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 70 | Male | Asymptomatic | 130.0 | 322.0 | False | 2 | 109.0 | No | 2.4 | Flat | 3 | Normal | Present |
| 1 | 67 | Female | Non-Anginal Pain | 115.0 | 564.0 | False | 2 | 160.0 | No | 1.6 | Flat | 0 | Reversable Defect | Absent |
| 2 | 57 | Male | Atypical Angina | 124.0 | 261.0 | False | 0 | 141.0 | No | 0.3 | Upsloping | 0 | Reversable Defect | Present |
| 3 | 64 | Male | Asymptomatic | 128.0 | 263.0 | False | 0 | 105.0 | Yes | 0.2 | Flat | 1 | Reversable Defect | Absent |
| 4 | 74 | Female | Atypical Angina | 120.0 | 269.0 | False | 2 | 121.0 | Yes | 0.2 | Upsloping | 1 | Normal | Absent |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 265 | 52 | Male | Non-Anginal Pain | 172.0 | 199.0 | True | 0 | 162.0 | No | 0.5 | Upsloping | 0 | Reversable Defect | Absent |
| 266 | 44 | Male | Atypical Angina | 120.0 | 263.0 | False | 0 | 173.0 | No | 0.0 | Upsloping | 0 | Reversable Defect | Absent |
| 267 | 56 | Female | Atypical Angina | 140.0 | 294.0 | False | 2 | 153.0 | No | 1.3 | Flat | 0 | Normal | Absent |
| 268 | 57 | Male | Asymptomatic | 140.0 | 192.0 | False | 0 | 148.0 | No | 0.4 | Flat | 0 | Fixed Defect | Absent |
| 269 | 67 | Male | Asymptomatic | 160.0 | 286.0 | False | 2 | 108.0 | Yes | 1.5 | Flat | 3 | Normal | Present |
270 rows × 14 columns
| Number of Instances | Number of Attributes |
|---|---|
| 270 | 14 |
# Per-feature completeness chart: bar height is the 'Percentage' column from
# Data_info, bars are coloured by the feature's data type.
data_info = Data_info(Data).reset_index(drop=False)
fig = px.bar(
    data_info,
    x='Features',
    y='Percentage',
    color='Data Type',
    text='Data Type',
    color_discrete_sequence=['PaleGreen', 'LightBlue', 'PeachPuff'],
    hover_data=data_info.columns,
)
# White canvas, legend boxed at the right edge, vertically centred.
fig.update_layout(
    plot_bgcolor='white',
    legend={'x': 1, 'y': .5, 'traceorder': "normal", 'bordercolor': "DarkGray", 'borderwidth': 1},
)
# Text placement and bar outline are applied to every trace in one pass.
fig.update_traces(
    texttemplate=6*' ' + '%{label}',
    textposition='inside',
    marker_line_color='Black',
    marker_line_width=1.,
    opacity=1,
)
fig.show()
# Side-by-side stacked age histograms, one panel per heart-disease status,
# with bars split by Sex and a shared y axis.
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                    subplot_titles=('Age Distribution (Heart Disease = Absent)',
                                    'Age Distribution (Heart Disease = Present)'))
Colors = ['Pink', 'BlueViolet']
LC = 'Black'
# Pin each Sex value to a fixed colour. Previously the colour sequence was
# reversed for the second panel to keep colours aligned, which only works
# while the two subsets happen to list the Sex values in opposite order.
Sex_Colors = {'Female': Colors[0], 'Male': Colors[1]}
for panel, status in enumerate(['Absent', 'Present'], start=1):
    hist = px.histogram(Data.loc[Data['Heart Disease'] == status], x = 'Age', color='Sex',
                        color_discrete_map= Sex_Colors, hover_data=Data.columns)
    # Copy every trace px produced (one per Sex value present in the subset)
    # rather than indexing [0] and [1], which fails if a subset has one Sex only.
    for trace in hist.data:
        fig.add_trace(trace, row=1, col=panel)
del panel, status, hist, trace, Sex_Colors
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=0.5, opacity=1)
# The first panel already supplies the legend entries; hide the duplicates.
fig.update_traces(showlegend = False, row=1, col=2)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig['layout']['yaxis'].update(range=[0, 18])
fig.show()
# Box plot of Age per Sex, split by heart-disease status.
Colors = ['OrangeRed', 'LimeGreen']
LC = 'Black'
fig = px.box(
    Data, x='Sex', y='Age', color='Heart Disease',
    color_discrete_sequence=Colors, hover_data=Data.columns,
)
# Trace styling: linear quartile computation plus a thin outline on markers.
fig.update_traces(quartilemethod='linear',
                  marker_line_color=LC, marker_line_width=0.5, opacity=1)
# Fixed-size white canvas with a centred title.
fig.update_layout(
    height=500, width=600, plot_bgcolor='white',
    title={'text': 'Age Distribution and Gender',
           'x':0.48, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'},
)
# Light-gray frame on both axes; the y axis also gets grid lines and a fixed range.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[20, 80],
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.show()
# Box plot of Resting Blood Pressure per Fasting Blood Sugar category,
# split by heart-disease status.
Colors = ['OrangeRed', 'LimeGreen']
LC = 'Black'
# Each update_* call returns the figure, so the styling is applied as a chain.
fig = (
    px.box(Data, x='Fasting Blood Sugar', y='Resting Blood Pressure', color='Heart Disease',
           color_discrete_sequence=Colors, hover_data=Data.columns)
    .update_traces(quartilemethod='linear',
                   marker_line_color=LC, marker_line_width=0.5, opacity=1)
    .update_layout(height=500, width=600, plot_bgcolor='white',
                   title={'text': 'Resting Blood Pressure Distribution and Fasting Blood Sugar',
                          'x':0.48, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'})
    .update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    .update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[80, 220],
                  showgrid=True, gridwidth=1, gridcolor='Lightgray')
)
fig.show()
# Violin plot (with inner box) of Serum Cholestoral per Fasting Blood Sugar
# category, split by heart-disease status.
Colors = ['OrangeRed', 'LimeGreen']
LC = 'Black'
fig = px.violin(
    Data,
    x='Fasting Blood Sugar',
    y='Serum Cholestoral',
    color='Heart Disease',
    box=True,
    color_discrete_sequence=Colors,
    hover_data=Data.columns,
)
# Canvas size, background and centred title in a single layout update.
fig.update_layout(
    height=500,
    width=600,
    plot_bgcolor='white',
    title={'text': 'Serum Cholestoral Distribution and Heart Disease',
           'x':0.48, 'y':0.95,
           'xanchor': 'center', 'yanchor': 'top'},
)
# Framed axes; grid lines and a fixed range on the y axis only.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 7e2],
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
# Thin outline around the markers.
fig.update_traces(marker_line_color=LC, marker_line_width=0.5, opacity=1)
fig.show()
In this section, we demonstrate the relationship between the maximum heart rate achieved and heart disease.
# Shared palette and outline colour for both heart-rate figures below.
Colors = ['OrangeRed', 'LimeGreen']
LC = 'Black'

# Figure 1: Maximum Heart Rate Achieved per Chest Pain Type (vertical violins),
# split by heart-disease status.
fig = px.violin(
    Data, x='Chest Pain Type', y='Maximum Heart Rate Achieved', color='Heart Disease', box=True,
    color_discrete_sequence=Colors, hover_data=Data.columns,
)
fig.update_layout(
    height=500, plot_bgcolor='white',
    title={'text': 'Maximum Heart Rate Achieved and Chest Pain Type',
           'x':0.48, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'},
)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
# The value axis (y) carries the grid and the fixed range.
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[50, 2.5e2],
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_traces(marker_line_color=LC, marker_line_width=0.5, opacity=1)
fig.show()

# Figure 2: Maximum Heart Rate Achieved per Exercise Induced Angina category,
# drawn horizontally (the heart rate is on the x axis).
fig = px.violin(
    Data, x='Maximum Heart Rate Achieved', y='Exercise Induced Angina', color='Heart Disease', box=True,
    color_discrete_sequence=Colors, hover_data=Data.columns,
)
fig.update_layout(
    height=400, width=750, plot_bgcolor='white',
    title={'text': 'Maximum Heart Rate Achieved and Exercise Induced Angina',
           'x':0.48, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'},
)
# Here the value axis is x, so it gets the grid and the fixed range instead.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 2.5e2],
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_traces(marker_line_color=LC, marker_line_width=0.5, opacity=1)
fig.show()
Detrano, R., Janosi, A., Steinbrunn, W., Pfisterer, M., Schmid, J.J., Sandhu, S., Guppy, K.H., Lee, S. and Froelicher, V., 1989. International application of a new probability algorithm for the diagnosis of coronary artery disease. The American journal of cardiology, 64(5), pp.304-310.
Aha, D. and Kibler, D., 1988. Instance-based prediction of heart-disease presence with the Cleveland database. University of California, 3(1), pp.3-2.
Gennari, J.H., Langley, P. and Fisher, D., 1989. Models of incremental concept formation. Artificial intelligence, 40(1-3), pp.11-61.